What are the most memorable inaugural address quotes that you can think of off the top of your head? Let me share two of my favorites and see if they are yours too.
“This great nation will endure as it has endured, will revive and will prosper. So, first of all, let me assert my firm belief that the only thing we have to fear is fear itself.” - Franklin D. Roosevelt, March 4, 1933.
“My fellow Americans: Ask not what your country can do for you - ask what you can do for your country. My fellow citizens of the world: Ask not what America will do for you, but what together we can do for the freedom of man.” - John F. Kennedy, Jan. 20, 1961.
Although a few are very memorable, most inaugural addresses have been long forgotten. This notebook will give us a chance to take a closer look at the inaugural speeches of the Presidents of the U.S. (POTUS), but from a data science perspective. The scope of this analysis includes 58 inaugural speeches of 39 POTUS, which were scraped from The American Presidency Project.
### Install and Load Libraries
# Packages that will be used for this notebook
# Packages required by this notebook
packages.used <- c("rvest", "xlsx", "tibble",
                   "tm", "tidytext", "dplyr",
                   "wordcloud", "qdap", "syuzhet",
                   "beeswarm", "RColorBrewer", "sentimentr",
                   "gplots", "factoextra", "MASS",
                   "scales", "RANN", "topicmodels")
# Packages not yet installed: a plain setdiff() against the installed set
# is sufficient (the original intersect() step was redundant)
packages.needed <- setdiff(packages.used, installed.packages()[, 1])
# Install any missing packages
if (length(packages.needed) > 0) {
  install.packages(packages.needed, dependencies = TRUE,
                   repos = "http://cran.us.r-project.org")
}
# Load libraries
library("rvest")
library("xlsx")
library("tibble")
library("tm")
library("tidytext")
# NOTE: MASS is loaded before dplyr on purpose, so that dplyr::select()
# masks MASS::select() in the pipelines below
library("MASS")
library("dplyr")
library("wordcloud")
# You may need to run
# sudo ln -f -s $(/usr/libexec/java_home)/jre/lib/server/libjvm.dylib /usr/local/lib
# in order to load qdap
#library(rJava)
library("qdap")
library("syuzhet")
library("beeswarm")
library("RColorBrewer")
library("gplots")
library("scales")
#library("sentimentr")
library("factoextra")
#library("RANN")
# NOTE(review): LDA() is called later in this notebook but topicmodels is
# commented out here -- confirm it is loaded elsewhere before the LDA chunk
#library("topicmodels")
# Project helper functions used throughout the notebook
# Obtain speechFuncs functions from the website listed below
# 'speechFuncs' includes 'f.speechlinks', 'f.plotsent.len', and 'f.smooth.topic'
# https://github.com/TZstatsADS/ADS_Teaching/blob/master/Tutorials/wk2-TextMining/lib/speechFuncs.R
source("../lib/speechFuncs.R")
# Obtain plotstacked functions from the website listed below
# 'plotstacked' includes 'plot.stacked'
# https://github.com/TZstatsADS/ADS_Teaching/blob/master/Tutorials/wk2-TextMining/lib/plotstacked.R
#source("../lib/plotstacked.R")
This notebook was prepared with the following environment settings.
print(R.version)
The current working directory is
getwd()
### Data Harvest: Scrap URLs of Inaugural Speeches of POTUS and Read in Speeches
# Get speeches URLs: scrape the index page that lists every inaugural address
main.page <- read_html(x = "http://www.presidency.ucsb.edu/inaugurals.php")
# f.speechlinks (from ../lib/speechFuncs.R) extracts the date/URL table
inaug.urls <- f.speechlinks(main.page)
# Remove the last line, irrelevant due to error (scraping artifact)
inaug.urls <- inaug.urls[-nrow(inaug.urls), ]
# Parse the first column ("January 20, 2017" style) into Date objects
inaug.urls[, 1] <- as.Date(inaug.urls[, 1], format="%B %e, %Y")

# Read the metadata file shipped with the project
file.path <- "../data/InaugurationInfo.xlsx"
inaug.info <- read.xlsx(file.path, sheetIndex = 1, stringsAsFactors = FALSE)

# Combine metadata with the scraped dates and URLs into inaug.list
inaug.list <- add_column(inaug.info, Date = inaug.urls[, 1], .after = 4)
inaug.list <- add_column(inaug.list, URLs = inaug.urls[, 2])

# The source file lacks a word count for Trump's speech (row 58); fill it in
inaug.list$Words[58] <- 1433
# "Words" arrives as text; coerce to numeric for later summaries
inaug.list$Words <- as.numeric(inaug.list$Words)

# Grover Cleveland appears under inconsistent names; normalize both rows
inaug.list$President[25] <- "Grover Cleveland"
inaug.list$President[27] <- "Grover Cleveland"
inaug.list$File[25] <- "GroverCleveland"
inaug.list$File[27] <- "GroverCleveland"

# Stage a 'Fulltext' column, then fetch each speech from its URL
inaug.list$Fulltext <- NA
# Folder where each speech is also saved as an individual '.txt' file
folder.path <- "../data/InauguralSpeeches/"
for (i in seq_len(nrow(inaug.list))) {
  # Download the page and isolate the speech text
  text <- read_html(inaug.list$URLs[i]) %>% # Load the page
    html_nodes(".displaytext") %>%          # Isolate the text
    html_text()                             # Get the text
  inaug.list$Fulltext[i] <- text
  # File name, e.g. "inaugGeorgeWashington-1.txt"
  filename <- paste0(folder.path,
                     "inaug",
                     inaug.list$File[i], "-",
                     inaug.list$Term[i], ".txt")
  # Write the speech to disk.  (The original piped sink() into cat(),
  # which only worked by accident because sink() returns NULL invisibly;
  # open, write and close the sink explicitly instead.)
  sink(file = filename)  # Redirect console output to the file
  cat(text)              # Write the speech
  sink()                 # Restore normal output
}

# Persist the combined table for later runs
write.csv(inaug.list, file = "../output/inaug_list.csv")
#summary(inaug.list)
The scope of this analysis includes 58 Inaugural Speeches of 39 POTUS, who are listed below.
# The 39 distinct presidents covered by the 58 speeches
unique(inaug.list$President)
# Distinct file identifiers (one per president), used for grouping later
all.File <- unique(inaug.list$File)
### Data Processing
### Data Processing: split speeches into sentences and score NRC emotions

# Accumulate per-speech data frames in a list and bind once at the end --
# growing sentence.list with rbind() inside the loop is quadratic in the
# total number of sentences.
sentence.rows <- vector("list", nrow(inaug.list))
for (i in seq_len(nrow(inaug.list))) {
  # Split speech i into sentences on the usual end-of-sentence marks
  sentences <- sent_detect(inaug.list$Fulltext[i],
                           endmarks = c("?", ".", "!", "|", ";"))
  if (length(sentences) > 0) {
    # Number of words in each sentence
    word.count <- word_count(sentences)
    # NRC scores: 8 emotions + positive/negative valence per sentence
    emotions <- get_nrc_sentiment(sentences)
    #colnames(emotions)=paste0("emo.", colnames(emotions))
    # Normalize scores by sentence length; +0.01 guards against
    # zero-word sentences producing a division by zero
    emotions <- diag(1 / (word.count + 0.01)) %*% as.matrix(emotions)
    # One data frame per speech: speech metadata + sentence-level columns
    sentence.rows[[i]] <- cbind(
      inaug.list[i, -ncol(inaug.list)],     # metadata (drop Fulltext)
      sentences = as.character(sentences),  # the sentence itself
      word.count,                           # its length in words
      emotions,                             # 8 emotions + 2 valences
      sent.id = seq_along(sentences)        # position within the speech
    )
  }
}
# NULL entries (speeches with no detected sentences) are skipped by rbind
sentence.list <- do.call(rbind, sentence.rows)
# Clean up sentence.list:
# some non-sentences exist in raw data due to erroneous extra end-of-sentence marks
sentence.list <- sentence.list %>% filter(!is.na(word.count))
# assign.color
# Map a single value (In.var) to the palette color of the group in
# In.Group.var that contains it.  Returns NULL (invisibly) when no
# group matches.
assign.color <- function(In.var, In.Group.var,
                         In.Palette.var="Set1", In.alpha.var=1){
  # In.Group.var is a named list of member vectors; one palette color
  # (with the requested alpha) is reserved per group, in list order
  col.use <- alpha(brewer.pal(length(In.Group.var),
                              In.Palette.var),
                   In.alpha.var)
  for (i in seq_along(In.Group.var)) {
    if (In.var %in% unlist(In.Group.var[i])) {
      return(col.use[i])
    }
  }
}
# assign.group
# Return the name of the first group in In.Group.var whose members
# include In.var; NULL (invisibly) when no group matches.
assign.group <- function(In.var, In.Group.var){
  # In.Group.var is a named list of member vectors
  for (i in seq_along(In.Group.var)) {
    if (In.var %in% unlist(In.Group.var[i])) {
      return(names(In.Group.var)[i])
    }
  }
}
# Speech.Length.Bar.Plot
# Bar plot of speech length (Words) for every File in the groups of
# In.Group, optionally colored by term instead of group.
#   In.list    - data frame with File, Term, Words, Date, President columns
#   In.Group   - named list of File vectors (one color per group)
#   In.Term    - named list mapping term labels to term codes to keep
#   In.by.Term - if TRUE, color bars by term rather than by group
#   In.Palette, In.alpha - RColorBrewer palette name and transparency
Speech.Length.Bar.Plot <- function(In.list=inaug.list,
                                   In.Group,
                                   In.Term=list("First Term"="1",
                                                "Second Term"="2",
                                                "Third Term"="3",
                                                "Fourth Term"="4"),
                                   In.by.Term=FALSE,
                                   In.Palette="Set1", In.alpha=1){
  # Keep only the requested files and terms
  var <- In.list%>%
    filter(File%in%unlist(In.Group),
           Term%in%unlist(In.Term))
  var$File <- factor(var$File)
  # var$FileOrdered <- reorder(var$File, var$Words, mean, order=T)
  if (In.by.Term==FALSE) {
    # One color per group in In.Group
    var$col.id <- unlist(lapply(var$File, assign.color, In.Group,
                                In.Palette, In.alpha))
    legend.id <- names(In.Group)
  }else{
    # One color per term in In.Term
    var$col.id <- unlist(lapply(var$Term, assign.color, In.Term,
                                In.Palette, In.alpha))
    legend.id <- names(In.Term)
  }
  # One bar per speech; barplot() returns the bar midpoints used below
  x <- barplot(var$Words, space = 1,
               col=var$col.id,
               cex.axis = 0.7,
               main = "Number of Words in a Speech"
  )
  # Slanted "<year> <president>" labels under each bar
  text(cex=0.5, x=x-0.25, y=-1.25,
       adj=1, srt = 60, xpd = TRUE,
       labels = paste(substr(var$Date,1,4),
                      var$President))
  legend("topright", inset= 0.02, legend =legend.id ,
         text.col = brewer.pal(length(legend.id),
                               In.Palette)[1:length(legend.id)],
         cex = 0.7, box.lty=2 )
}
# Speech.Length.Box.Plot
# Box plot of speech length (Words): one box per group of In.Group, or --
# when In.by.Term is TRUE -- one box per term of In.Term.
#   In.list  - data frame with File, Term and Words columns
#   In.Group - named list of File vectors defining the groups
#   In.Term  - named list mapping term labels to term codes
#   In.Palette, In.alpha - RColorBrewer palette name and transparency
Speech.Length.Box.Plot <- function(In.list=inaug.list,
                                   In.Group,
                                   In.Term=list("First Term"="1",
                                                "Second Term"="2",
                                                "Third Term"="3",
                                                "Fourth Term"="4"),
                                   In.by.Term=FALSE,
                                   In.Palette="Set1", In.alpha=1){
  # Keep only the requested files and terms
  var <- In.list%>%
    filter(File%in%unlist(In.Group),
           Term%in%unlist(In.Term))
  var$File <- factor(var$File)
  # var$FileOrdered <- reorder(var$File, var$Words, mean, order=T)
  if (In.by.Term==FALSE) {
    # Label each speech with the In.Group list it belongs to
    var$group.var <- unlist(lapply(var$File, assign.group, In.Group))
    var$group.var <- factor(var$group.var, names(In.Group))
    legend.id <- names(In.Group)
  }else{
    # Label each speech by its term instead
    var$group.var <- unlist(lapply(var$Term, assign.group, In.Term))
    var$group.var <- factor(var$group.var, names(In.Term))
    legend.id <- names(In.Term)
  }
  # One box per group, colors consistent with the bar plot
  boxplot(var$Words~var$group.var,
          col=alpha(brewer.pal(length(legend.id),In.Palette),
                    In.alpha)[1:length(legend.id)],
          cex.axis = 0.7,
          main = "Number of Words in a Speech" )
}
# Bee.Swarm.Plot
# Bee swarm plot of sentence lengths (word.count): one horizontal swarm
# per File, colored by the group of In.Group the file belongs to.
#   In.list  - sentence-level data frame (needs File and word.count)
#   In.Group - named list of File vectors; one palette color per group
Bee.Swarm.Plot <- function(In.list=sentence.list, In.Group,
                           In.Palette="Set1", In.alpha=0.4){
  var <- In.list%>%
    filter(File%in%unlist(In.Group))
  var$File <- factor(var$File)
  # Order files by mean sentence length so the swarms appear sorted
  var$FileOrdered <- reorder(var$File, var$word.count, mean, order=T)
  for (i in 1:length(In.Group)){
    if (i==1){
      # The first group draws the full plot (axes, title, labels)
      beeswarm(word.count~FileOrdered,
               data = var%>%
                 filter(File%in%unlist(In.Group[i])),
               horizontal = TRUE,
               pch=16, col=alpha(brewer.pal(length(In.Group), In.Palette)[i],
                                 In.alpha),
               cex=0.55, cex.axis=0.55, cex.lab=1,
               spacing=5/nlevels(var$FileOrdered),
               las=2, ylab="", xlab="",
               main="Number of Words in a Sentence")
    } else {
      # Subsequent groups are overlaid on the same axes (add=TRUE)
      beeswarm(word.count~FileOrdered,
               data = var%>%
                 filter(File%in%unlist(In.Group[i])),
               add=TRUE,
               horizontal = TRUE,
               pch=16, col=alpha(brewer.pal(length(In.Group), In.Palette)[i],
                                 In.alpha),
               cex=0.55, cex.axis=0.55, cex.lab=1,
               spacing=5/nlevels(var$FileOrdered)
      )
    }
  }
  legend("bottomright", inset= 0.02, legend =names(In.Group) ,
         text.col = brewer.pal(length(In.Group),In.Palette)[1:length(In.Group)],
         cex = 0.7, box.lty=2 )
}
# Bee.Swarm.Plot.Group
# Bee swarm plot of sentence lengths with one swarm per GROUP of
# In.Group (rather than per File), plus a translucent box plot overlay.
Bee.Swarm.Plot.Group <- function(In.list=sentence.list, In.Group,
                                 In.Palette="Set1", In.alpha=0.4){
  var <- In.list%>%
    filter(File%in%unlist(In.Group))
  var$File <- factor(var$File)
  # Files ordered by mean sentence length (used only for point spacing)
  var$FileOrdered <- reorder(var$File, var$word.count, mean, order=T)
  # Label each sentence with the group its File belongs to
  var$group.var <- unlist(lapply(var$File, assign.group, In.Group))
  var$group.var <- factor(var$group.var, names(In.Group))
  beeswarm(word.count~group.var,
           data = var%>%filter(File%in%unlist(In.Group)),
           horizontal = TRUE,
           pch=16, col=alpha(brewer.pal(length(In.Group),
                                        In.Palette)[1:length(In.Group)],
                             In.alpha),
           cex=0.55, cex.axis=0.55, cex.lab=1,
           spacing=5/nlevels(var$FileOrdered),
           las=2, ylab="", xlab="",
           main="Number of Words in a Sentence")
  legend("bottomright", inset=0.02, legend =names(In.Group) ,
         text.col = brewer.pal(length(In.Group),In.Palette)[1:length(In.Group)],
         cex = 0.7, box.lty=2 )
  # Semi-transparent box plot on top of the swarms shows medians/quartiles
  boxplot(var$word.count~var$group.var, horizontal=TRUE,
          col="#0000ff22", axes=FALSE, add=TRUE)
}
# Sentence.Searcher.Group
# Return a table whose columns are the groups in In.Group and whose rows
# are the In.n.sentences longest (In.Long=TRUE) or shortest sentences of
# each group, restricted to sentences with at least In.min.words words.
Sentence.Searcher.Group <- function(In.list=sentence.list,
                                    In.Group,
                                    In.Term=c("1","2","3","4"),
                                    In.Long=TRUE,
                                    In.min.words=3, In.n.sentences=5){
  # Seed column so per-group results can be cbind-ed; dropped at the end
  # (the original's seq(1:n) was a redundant way to write seq_len(n))
  df <- seq_len(In.n.sentences)
  for (i in seq_along(In.Group)) {
    # Qualifying sentences of this group, longest first
    var <- In.list%>%
      filter(File%in%unlist(In.Group[i]),
             Term%in%In.Term,
             word.count>=In.min.words)%>%
      arrange(desc(word.count))%>%
      select(sentences)
    # Longest sentences come from the head, shortest from the tail
    if (In.Long==TRUE){
      var <- head(var, In.n.sentences)
    } else {
      var <- tail(var, In.n.sentences)
    }
    if (nrow(var)!=0){
      df <- cbind(df, var)
    } else {
      # Placeholder column when the group has no qualifying sentences
      df <- cbind(df, c(rep("SYSTEM: No data available in table", In.n.sentences)))
    }
  }
  # Drop the seed column and label the columns by group name
  df <- df[,-1]
  colnames(df) <- names(In.Group)
  df
}
# Word.Cloud.Group
# Draw one word cloud per group in In.Group from the full speech texts.
# NOTE(review): only 6 sequential palettes are listed below, so at most
# 6 groups can be drawn -- confirm callers never pass more.
Word.Cloud.Group <- function(In.list=inaug.list, In.Group){
  for (i in 1:length(In.Group)){
    # Speeches belonging to group i
    var <- In.list%>%
      filter(File%in%unlist(In.Group[i]))
    # Build and clean a corpus from the raw speech texts
    docs <- Corpus(VectorSource(var$Fulltext))
    docs <- tm_map(docs, stripWhitespace) # Eliminate extra whitespace
    docs <- tm_map(docs, content_transformer(tolower)) # Convert to lower case
    docs <- tm_map(docs, removeWords, stopwords("english")) # Remove stopwords
    docs <- tm_map(docs, removeWords, character(0))
    docs <- tm_map(docs, removePunctuation) # Remove punctuations
    #docs <- tm_map(docs, removeNumbers) # Remove numbers
    #docs <- tm_map(docs, stemDocument) # Stem document
    # Aggregate term counts across the group's speeches
    tdm <- TermDocumentMatrix(docs)
    tdm.tidy <- tidy(tdm)
    tdm.var <- summarise(group_by(tdm.tidy, term), sum(count))
    # A distinct sequential palette per group
    pal <- c("Reds","Blues","Greens",
             "Purples","Oranges", "Greys")
    wordcloud(tdm.var$term, tdm.var$`sum(count)`,
              scale=c(5,0.5),
              max.words=100,
              min.freq=1,
              random.order=FALSE,
              rot.per=0.3,
              use.r.layout=T,
              random.color=FALSE,
              colors=brewer.pal(5, pal[i])
    )
    # Title in a dark shade of the group's palette
    title(main = names(In.Group)[i],
          cex.main=1.5,
          col.main=brewer.pal(5, pal[i])[4]
    )
  }
}
# Sentence.Length.Sentimetal.Plot
# For a single speech (File + Term), plot sentence length against
# sentence position, colored by the sentence's dominant NRC emotion and
# shaded by that emotion's strength.
Sentence.Length.Sentimetal.Plot <- function(In.list=sentence.list, In.File, In.Term){
  # Strength of the dominant emotion/valence per sentence
  In.list$topemotion.v <- apply(select(In.list, anger:positive),
                                1, max)
  # Snapshot of the raw maxima BEFORE the floor below, so the
  # "no clear emotion" test further down still sees the original values
  temp <- In.list$topemotion.v
  # Weak sentences are floored to 1 so their alpha stays visible
  In.list$topemotion.v[temp<0.05] <- 1
  # Index (1..10) of the dominant column among anger:positive
  In.list$topemotion <- apply(select(In.list, anger:positive),
                              1, which.max)
  # BUG FIX: the original tested the already-floored topemotion.v here,
  # so no sentence was ever flagged as emotionless; test the snapshot.
  In.list$topemotion[temp<0.05] <- 0
  # Shift so 0 ("no emotion") maps to color slot 1
  In.list$topemotion <- In.list$topemotion + 1
  # Keep only the requested speech
  df <- In.list%>%
    filter(File==In.File,
           Term==In.Term)%>%
    select(sent.id, word.count,
           topemotion, topemotion.v)
  # Colors: palette entry for the emotion, alpha grows with its strength.
  # NOTE(review): topemotion can reach 11 while brewer.pal(10, "Set3")
  # supplies only 10 colors -- sentences dominated by "positive" get an
  # NA color; confirm the intended palette size.
  col.use <- brewer.pal(10,"Set3")
  ptcol.use <- alpha(col.use[df$topemotion],
                     sqrt(sqrt(sqrt(df$topemotion.v))))
  # One vertical line per sentence, height = word count
  plot(df$sent.id, df$word.count,
       col=ptcol.use,
       type="h" #,ylim=c(-10, max(In.list$word.count))
  )
  title(main =paste(In.File, "Term", In.Term, sep=" "),
        xlab = "Sentence ID", ylab = "Number of Words")
  # Legend lists the ten anger:positive columns ("suprise" typo fixed)
  legend("topright", inset= 0.02,
         legend =c("anger","anticipation","disgust", "fear", "joy",
                   "sadness","surprise","trust","negative","positive"),
         text.col = brewer.pal(10,"Set3"),
         cex = 0.6, box.lty=2 )
}
# Sentimental.Analysis.Plots
# For each group in In.Group, draw a horizontal bar plot of the eight
# NRC emotions, sorted ascending.  Note: because of the `> 0.01` inside
# colMeans(), each bar is the PROPORTION of sentences whose normalized
# emotion score exceeds 0.01, not the raw mean score.
Sentimental.Analysis.Plots <- function(In.list=sentence.list, In.Group){
  for (i in 1:length(In.Group)){
    # Optional heatmap of emotion correlations (disabled)
    # heatmap.2(cor(In.list%>%
    # filter(File%in%unlist(In.Group[i]))%>%
    # select(anger:trust)),
    # scale = "none",
    # col = bluered(100), margin=c(6, 6), key=F,
    # trace = "none", density.info = "none")
    # Fraction of group i's sentences in which each emotion is present
    emo.means.var <- colMeans(In.list%>%
                                filter(File%in%unlist(In.Group[i]))%>%
                                select(anger:trust)>0.01)
    # Bars sorted ascending; colors follow the same ordering
    barplot(emo.means.var[order(emo.means.var)],
            las=2, col=brewer.pal(8,"Pastel2")[order(emo.means.var)],
            horiz=T, main=names(In.Group)[i], xlab = "Average Value of Emotions")
  }
}
# Sentimental.Sentence.Searcher
# For each group in In.Group, find the sentence with the maximum score
# for each of the 8 NRC emotions.  Returns an 8 x length(In.Group)
# character matrix (rows = emotions, columns = groups).
Sentimental.Sentence.Searcher <- function(In.list=sentence.list,
                                          In.Group,
                                          In.Term=c("1","2","3","4"),
                                          In.min.words=3){
  df <- NULL
  for (i in seq_along(In.Group)) {
    # Qualifying sentences of this group with their emotion columns
    var <- In.list%>%
      filter(File%in%unlist(In.Group[i]),
             Term%in%In.Term,
             word.count>=In.min.words)%>%
      select(sentences:trust)
    if (nrow(var)!=0){
      # Column-wise argmax over the 8 emotion columns (dropping the
      # leading sentences and word.count columns).  Computed only when
      # data exists -- the original evaluated it unconditionally, even
      # for empty groups.
      rst <- as.character(var$sentences[apply(var[,-(1:2)],2,which.max)])
      df <- cbind(df, rst)
    } else {
      # Placeholder column when the group has no qualifying sentences
      df <- cbind(df, c(rep("SYSTEM: No data available in table", 8)))
    }
  }
  colnames(df) <- names(In.Group)
  rownames(df) <- c("anger","anticipation","disgust","fear",
                    "joy","sadness","surprise","trust")
  df
}
# Sentimental.Sentence.KMeans
# K-means clustering (4 clusters) of the presidents in In.group by their
# mean NRC emotion profile.  Returns list(kmeans result, per-president
# emotion summary).
Sentimental.Sentence.KMeans <- function(In.list=sentence.list,
                                        In.group,
                                        In.Term=c("1","2","3","4")){
  # Mean of each of the 8 NRC emotions per president (File)
  var <- In.list%>%
    filter(File%in%unlist(In.group),
           Term%in%In.Term)%>%
    group_by(File)%>%
    summarise(anger=mean(anger),
              anticipation=mean(anticipation),
              disgust=mean(disgust),
              fear=mean(fear),
              joy=mean(joy),
              sadness=mean(sadness),
              surprise=mean(surprise),
              trust=mean(trust)
              #negative=mean(negative),
              #positive=mean(positive)
    )
  var <- as.data.frame(var)
  # Replace NA means by 0 in the emotion columns (columns 2..9)
  for (k in 2:9) {
    var[, k][is.na(var[, k])] <- 0
  }
  # Row names = president file names, so clusters are labeled in plots
  rownames(var) <- as.character(var[, 1])
  presid.summary <- var
  # Cluster on the emotion columns only; named arguments make the
  # original positional `4` (the number of centers) explicit
  km.res <- kmeans(var[, -1], centers = 4, iter.max = 200)
  # NOTE(review): kmeans uses random initial centers and no seed is set
  # here, so assignments vary between runs -- confirm whether callers
  # should set.seed() first.
  return(list(km.res, presid.summary))
}
Groups of Interest
The following factors were considered in this analysis.
Types of Analysis
For each group of interest, we will perform the following analysis.
The scope of this analysis includes 58 inaugural speeches of 39 POTUS, which were scraped from The American Presidency Project. We will perform some analysis over all these speeches, hopefully to learn some new facts about them. We plan to find out the length of these speeches, the length of the sentences used in a speech, and the most commonly used words. Among all these speeches, five inaugural addresses are considered the best of all time: Thomas Jefferson’s 1st (1801), Abraham Lincoln’s 2nd (1865), Franklin Roosevelt’s 1st (1933), Franklin Roosevelt’s 2nd (1937), and John F. Kennedy’s (1961). We will take a closer look at them and see if we can find any interesting facts — similarities or differences — among these five speeches which may have contributed to their great success. In addition, the most recent four POTUS may sound more familiar than others to our Millennials, so let’s take their speeches into consideration as part of the analysis of individual speeches as well.
Staging for Word Cloud
# Staging for the word clouds: read all saved speeches into a corpus
folder.path <- "../data/InauguralSpeeches/"
# BUG FIX: list.files() takes a regular expression, not a glob; "*.txt"
# also matched any name merely containing "txt".  Anchor on the extension.
speeches <- list.files(path = folder.path, pattern = "\\.txt$")
#prex.out <- substr(speeches, 6, nchar(speeches)-4)
docs <- Corpus(DirSource(folder.path))
#docs <- Corpus(VectorSource(inaug.list$Fulltext))
# Clean the text documents
docs <- tm_map(docs, stripWhitespace) # Eliminate extra whitespace
docs <- tm_map(docs, content_transformer(tolower)) # Convert to lower case
docs <- tm_map(docs, removeWords, stopwords("english")) # Remove stopwords
docs <- tm_map(docs, removeWords, character(0))
docs <- tm_map(docs, removePunctuation) # Remove punctuations
#docs <- tm_map(docs, removeNumbers) # Remove numbers
#docs <- tm_map(docs, stemDocument) # Stem document
# Term-document matrix over all speeches, plus tidy per-term totals
tdm.all <- TermDocumentMatrix(docs)
tdm.tidy <- tidy(tdm.all)
tdm.overall <- summarise(group_by(tdm.tidy, term), sum(count))
# Generate `TF-IDF Weighted Document-Term Matrices`
# (used later by the interactive word clouds)
dtm.all <- DocumentTermMatrix(docs,
                              control = list(weighting = function(x)
                                weightTfIdf(x,
                                            normalize =FALSE),
                                stopwords = TRUE))
dtm.tidy <- tidy(dtm.all)
#dtm.all <- DocumentTermMatrix(docs)
# Convert rownames to filenames
#rownames(dtm.all) <- paste(corpus.list$File, corpus.list$Term,
#                           corpus.list$sent.id, sep="_")
#rowTotals <- apply(dtm.all , 1, sum) #Find the sum of words in each Document
#dtm.all <- dtm.all[rowTotals> 0, ]
#corpus.list <- corpus.list[rowTotals>0, ]
Staging for LDA Topic Modeling
# Staging for LDA topic modeling: build one "snippet" document per
# sentence, consisting of the sentence plus its immediate neighbors
corpus.list <- sentence.list[2:(nrow(sentence.list)-1), ]
# Previous and next sentence for every row kept in corpus.list
sentence.pre <- sentence.list$sentences[1:(nrow(sentence.list)-2)]
sentence.post <- sentence.list$sentences[3:(nrow(sentence.list)-1)]
# Snippet = previous + current + next sentence
corpus.list$snipets <- paste(sentence.pre, corpus.list$sentences,
                             sentence.post, sep=" ")
# Drop each speech's first sentence and the row before it (the previous
# speech's last sentence): their snippets would straddle two speeches
rm.rows=(1:nrow(corpus.list))[corpus.list$sent.id==1]
rm.rows=c(rm.rows, rm.rows-1)
corpus.list=corpus.list[-rm.rows, ]
docs <- Corpus(VectorSource(corpus.list$snipets))
#writeLines(as.character(docs[[sample(1:nrow(corpus.list), 1)]]))
# Clean the snippet corpus for topic modeling
docs <- tm_map(docs, stripWhitespace) # Eliminate extra whitespace
docs <- tm_map(docs, content_transformer(tolower)) # Convert to lower case
docs <- tm_map(docs, removeWords, stopwords("english")) # Remove stopwords
docs <- tm_map(docs, removeWords, character(0))
docs <- tm_map(docs, removePunctuation) # Remove punctuations
docs <- tm_map(docs, removeNumbers) # Remove numbers
docs <- tm_map(docs, stemDocument) # Stem document
# Plain term counts for LDA (TF-IDF variant kept for reference, disabled)
#dtm.all.lda <- DocumentTermMatrix(docs,
# control = list(weighting = function(x)
# weightTfIdf(x,
# normalize =FALSE),
# stopwords = TRUE))
dtm.all.lda <- DocumentTermMatrix(docs)
# Row names identify the snippet: <File>_<Term>_<sent.id>
rownames(dtm.all.lda) <- paste(corpus.list$File, corpus.list$Term,
                               corpus.list$sent.id, sep="_")
# Find the sum of words in each document and drop snippets that became
# empty after cleaning (LDA cannot handle all-zero rows)
rowTotals <- apply(dtm.all.lda , 1, sum)
dtm.all.lda <- dtm.all.lda[rowTotals> 0, ]
corpus.list <- corpus.list[rowTotals>0, ]
Below is a summary of the length of all speeches.
summary(inaug.list$Words)
Min. 1st Qu. Median Mean 3rd Qu. Max.
135 1432 2084 2337 2894 8460
To get a better sense of the distribution of the number of words in a speech, a box-and-whisker plot is created below. As shown, the speeches of William H. Harrison and William H. Taft were much longer than the others.
# Box-and-whisker plot of speech lengths; outliers labeled by president
bxpdat.speech <- boxplot(inaug.list$Words, xlab="Number of Words in a Speech")
# Place president names next to the outlying points returned by boxplot()
text(x=1.2, y=bxpdat.speech$out,
     labels=inaug.list$President[which(inaug.list$Words%in%bxpdat.speech$out)],
     cex=0.7, col="blue")
# File identifiers ordered from longest to shortest speech
speech.len.file <- inaug.list$File[order(inaug.list$Words, decreasing =TRUE)]
Below listed the top three POTUS who had the longest and shortest speeches respectively.
The top three POTUS who had the longest speeches are
head(speech.len.file,3)
[1] "WilliamHenryHarrison" "WilliamHowardTaft" "JamesKPolk"
The top three POTUS who had the shortest speeches are
tail(speech.len.file,3)
[1] "AbrahamLincoln" "FranklinDRoosevelt" "GeorgeWashington"
Below is a summary of the length of sentences used in an inaugural speech.
# Summary of the length of sentences
summary(sentence.list$word.count)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 13.00 20.00 23.92 31.00 124.00
William Henry Harrison had the longest sentence, with 124 words, in his speech. Below are listed the shortest sentences, which have been filtered to be longer than three words, and the longest sentences in all speeches.
# Sentences of at least 3 words, ordered from longest to shortest
sentence.len.file <- sentence.list%>%
  filter(word.count>=3)%>%
  arrange(desc(word.count))%>%
  select(sentences)
# Shortest five (of the filtered set) and longest five sentences
tail(sentence.len.file,5)
head(sentence.len.file,5)
A box-and-whisker plot has been created below to help us get a better sense of the distribution of the number of words in a sentence.
# Mean sentence length per president, then its distribution as a box plot
word.count.ave <- sentence.list%>%group_by(President)%>%summarise(word.count=round(mean(word.count,0)))
bxpdat.sentence <- boxplot(word.count.ave$word.count,
                           xlab="Number of Words in a Sentence")
# Mark the outlier values (no labels attached)
text(x=1.2, y=bxpdat.sentence$out, cex=0.5, col="blue")
The text that is larger in size and darker in color in the word cloud below represents the words that appeared most often in all inaugural speeches. Unsurprisingly, the most commonly used words in all speeches include “will”, “government”, and “people”.
# Word cloud of the most frequent terms across ALL inaugural speeches;
# seed fixed so the layout is reproducible
set.seed(123)
wordcloud(tdm.overall$term, tdm.overall$`sum(count)`,
          scale=c(5,0.5),
          max.words=100,
          min.freq=1,
          random.order=FALSE,
          rot.per=0.3,
          use.r.layout=T,
          random.color=FALSE,
          colors=brewer.pal(5,"Blues")
)
title(main="What Are The Most Common Words in Inaugural Speeches?", cex.main=0.9)
An interactive word cloud has been created to help us picture the most frequent words used in each individual speech. It can be used to view the word cloud of each individual speech as well as to compare two different speeches. We will use this interactive word cloud to identify the most commonly used words in the most famous inaugural speeches as well as in the inaugural speeches given by the four most recent presidents.
# Interactive side-by-side word clouds: pick any two speeches and the
# number of words to display; terms are weighted by TF-IDF (dtm.tidy)
library(shiny)
shinyApp(
  ui = fluidPage(
    # Speech pickers and word-count slider
    fluidRow(style = "padding-bottom: 20px;",
             column(4, selectInput('speech1', 'Speech 1',
                                   speeches, selected=speeches[5])),
             column(4, selectInput('speech2', 'Speech 2',
                                   speeches, selected=speeches[9])),
             column(4, sliderInput('nwords', 'Number of words', 3,
                                   min = 20, max = 200, value=100, step = 20))
    ),
    # The two word clouds, rendered side by side
    fluidRow(
      plotOutput('wordclouds', height = "400px")
    ),
    # Button that closes the browser window and stops the app
    fluidRow(
      actionButton("close","Close window",
                   onclick = "setTimeout(function(){window.close();},500);")
    )
  ),
  server = function(input, output, session) {
    # Terms and TF-IDF counts of the two selected speeches; documents in
    # dtm.tidy are keyed by the position of the file in `speeches`
    selectedData <- reactive({
      list(dtm.term1=dtm.tidy$term[dtm.tidy$document==
                                     as.character(which(speeches == input$speech1))],
           dtm.count1=dtm.tidy$count[dtm.tidy$document==
                                       as.character(which(speeches == input$speech1))],
           dtm.term2=dtm.tidy$term[dtm.tidy$document==
                                     as.character(which(speeches == input$speech2))],
           dtm.count2=dtm.tidy$count[dtm.tidy$document==
                                       as.character(which(speeches == input$speech2))])
    })
    # Draw both clouds into a single two-panel plot
    output$wordclouds <- renderPlot(height = 400, {
      par(mfrow=c(1,2), mar = c(0, 0, 3, 0))
      wordcloud(selectedData()$dtm.term1,
                selectedData()$dtm.count1,
                scale=c(4,0.5),
                max.words=input$nwords,
                min.freq=1,
                random.order=FALSE,
                rot.per=0.3,
                use.r.layout=T,
                random.color=FALSE,
                colors=brewer.pal(5,"Blues"),
                main=input$speech1)
      wordcloud(selectedData()$dtm.term2,
                selectedData()$dtm.count2,
                scale=c(4,0.5),
                max.words=input$nwords,
                min.freq=1,
                random.order=FALSE,
                rot.per=0.3,
                use.r.layout=T,
                random.color=FALSE,
                colors=brewer.pal(5,"Blues"),
                main=input$speech2)
    })
    # Stop the app once the close button has been pressed
    observe({
      if (input$close>0) stopApp()
      # js$closeWindow()
    })
  },
  options = list(height = 600)
)
For the five inaugural addresses which are considered the best of all time, we can see how their emotion changed over the speech, as shown in the plot below.
# Emotion timelines for the five addresses regarded as the best ever.
# NOTE(review): mfrow reserves 7 rows but only 5 plots are drawn --
# confirm whether c(5,1) was intended
par(mar=c(2,2,2,1), mfrow=c(7,1))
Sentence.Length.Sentimetal.Plot(sentence.list,"ThomasJefferson",1)
Sentence.Length.Sentimetal.Plot(sentence.list,"AbrahamLincoln",2)
Sentence.Length.Sentimetal.Plot(sentence.list,"FranklinDRoosevelt",1)
Sentence.Length.Sentimetal.Plot(sentence.list,"FranklinDRoosevelt",2)
Sentence.Length.Sentimetal.Plot(sentence.list,"JohnFKennedy",1)
For the most recent four POTUS, who may sound more familiar than others to our Millennials, let’s take a closer look at how their emotion changed over the speeches. Below shows the analysis of speeches from both the first and second terms; note that President Trump is currently in his first term.
# Emotion timelines for the four most recent presidents (both terms where
# available; Trump had served only his first term at this writing)
par(mar=c(2,2,2,1), mfrow=c(7,1))
Sentence.Length.Sentimetal.Plot(sentence.list,"WilliamJClinton",1)
Sentence.Length.Sentimetal.Plot(sentence.list,"WilliamJClinton",2)
Sentence.Length.Sentimetal.Plot(sentence.list,"GeorgeWBush",1)
Sentence.Length.Sentimetal.Plot(sentence.list,"GeorgeWBush",2)
Sentence.Length.Sentimetal.Plot(sentence.list,"BarackObama",1)
Sentence.Length.Sentimetal.Plot(sentence.list,"BarackObama",2)
Sentence.Length.Sentimetal.Plot(sentence.list,"DonaldJTrump",1)
# Run LDA (Latent Dirichlet allocation)
# Set parameters for Gibbs sampling
burnin <- 4000 # Drop the first 4000 samples
iter <- 2000 # Sampling iterations after burn-in
thin <- 500 # Pick only every 500th sample
seed <-list(2003,5,63,100001,765) # One seed per restart
nstart <- 5 # Number of independent restarts
best <- TRUE # Keep the restart with the largest posterior
k <- 8 # Number of topics
# Fit the model on the snippet document-term matrix
ldaOut.all <-LDA(dtm.all.lda, k, method="Gibbs", control=list(nstart=nstart,
                                                              seed = seed, best=best,
                                                              burnin = burnin, iter = iter,
                                                              thin=thin))
# Most likely topic per snippet document
ldaOut.topics.all <- as.matrix(topics(ldaOut.all))
# Topic frequency table (1:k prepended so empty topics still appear)
table(c(1:k, ldaOut.topics.all))
1 2 3 4 5 6 7 8
770 750 982 658 788 481 578 532
# Persist the docs-to-topics assignments
write.csv(ldaOut.topics.all,file=paste("../output/LDAGibbs ",
                                       k," DocsToTopics.csv", sep = ''))
# Top 10 terms in each topic
ldaOut.terms.all <- as.matrix(terms(ldaOut.all,10))
write.csv(ldaOut.terms.all,file=paste("../output/LDAGibbs ",
                                      k," TopicsToTerms.csv", sep = ''))
# Probabilities associated with each topic assignment
topicProbabilities.all <- as.data.frame(ldaOut.all@gamma)
write.csv(topicProbabilities.all,file=paste("../output/LDAGibbs ",
                                            k," TopicProbabilities.csv", sep = ''))
# Most salient terms per topic: scale the per-topic term weights and keep
# the 10 largest per topic
terms.beta <- ldaOut.all@beta
terms.beta <- scale(terms.beta)
# BUG FIX: the original rbind-ed onto the never-updated `topics.terms`,
# so topics.terms.all ended up holding only the last topic's row;
# accumulate into topics.terms.all itself.
topics.terms.all <- NULL
for (i in seq_len(k)) {
  topics.terms.all <- rbind(topics.terms.all,
                            ldaOut.all@terms[order(terms.beta[i, ],
                                                   decreasing = TRUE)[1:10]])
}
#topics.terms.all
# Preview the top-3 terms of every topic
ldaOut.terms.all[1:3,]
Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7 Topic 8
[1,] "govern" "war" "world" "law" "time" "will" "peopl" "countri"
[2,] "state" "great" "peac" "upon" "year" "nation" "free" "duti"
[3,] "power" "secur" "new" "shall" "now" "can" "nation" "may"
Based on the most popular terms and the most salient terms for each topic, we assign a hashtag to each topic, which are “Goverment”, “War”, “World”, “Law”, “Time”, “Nation”, “People”, and “Country”.
# Hand-assigned hashtags, one per LDA topic (order matches topics 1..8)
topics.hash <- c("Goverment", "War", "World", "Law",
                 "Time", "Nation", "People", "Country")
print(topics.hash)
[1] "Goverment" "War" "World" "Law" "Time" "Nation" "People"
[8] "Country"
# Attach topic assignments (numeric and hashtag) to each snippet
corpus.list$ldatopic <- as.vector(ldaOut.topics.all)
corpus.list$ldahash <- topics.hash[ldaOut.topics.all]
# Label the per-topic probability columns with the hashtags
colnames(topicProbabilities.all) <- topics.hash
corpus.list.df <- cbind(corpus.list, topicProbabilities.all)
We can use Word Cloud to visualize the top topics as shown below.
# Word cloud of topic hashtags: wordcloud() computes frequencies from the
# repeated labels, so size reflects how often a topic is the most likely
set.seed(123)
wordcloud(corpus.list.df$ldahash,
          scale=c(5,0.5),
          max.words=100,
          min.freq=1,
          random.order=FALSE,
          rot.per=0.3,
          use.r.layout=T,
          random.color=FALSE,
          colors=brewer.pal(5,"Blues")
)
Summary of Group of Interest
| Number of Terms Served | Number of Presidents |
|---|---|
| One Term | 22 |
| Two Terms | 16 |
| Four Terms | 1 |
This analysis has primarily focused on presidents who served one or two terms. The analysis of the speeches of Franklin Roosevelt, who was the only POTUS who served more than two terms, may be incorporated in the future.
### Number of Terms Served
# Summary of groups by the number of terms served
#inaug.list%>%
# group_by(Term)%>%
# summarise(Count = n_distinct(President))

# Partition presidents (by 'File' id) according to the terms they served.
# A Term value above 2 means more than two terms (Franklin D. Roosevelt).
four.terms <- unique(inaug.list$File[inaug.list$Term>2])
# Everyone with a second inaugural served at least two terms
twoplus.terms <- inaug.list$File[inaug.list$Term==2]
# Exactly two terms: drop the 4-term president(s).  %in% handles any
# number of multi-term presidents (the original `!=` comparison only
# worked because four.terms happened to have length 1).
two.terms <- twoplus.terms[!(twoplus.terms %in% four.terms)]
# One term: never appeared with Term == 2
one.term <- inaug.list$File[!(inaug.list$File%in%twoplus.terms)]
# Named groups consumed by the plotting helpers
TermGroup <- list("One Term"=one.term, "Two Terms"=two.terms, "Four Terms"=four.terms)
#TermGroup.name <- "Number of Terms Served"
POTUS are assigned into three groups by the number of terms they have served.
TermGroup$`One Term`
[1] "JohnAdams" "JohnQuincyAdams" "MartinvanBuren"
[4] "WilliamHenryHarrison" "JamesKPolk" "ZacharyTaylor"
[7] "FranklinPierce" "JamesBuchanan" "RutherfordBHayes"
[10] "JamesGarfield" "BenjaminHarrison" "TheodoreRoosevelt"
[13] "WilliamHowardTaft" "WarrenGHarding" "CalvinCoolidge"
[16] "HerbertHoover" "HarrySTruman" "JohnFKennedy"
[19] "LyndonBJohnson" "JimmyCarter" "GeorgeBush"
[22] "DonaldJTrump"
TermGroup$`Two Terms`
[1] "GeorgeWashington" "ThomasJefferson" "JamesMadison" "JamesMonroe"
[5] "AndrewJackson" "AbrahamLincoln" "UlyssesSGrant" "GroverCleveland"
[9] "WilliamMcKinley" "WoodrowWilson" "DwightDEisenhower" "RichardNixon"
[13] "RonaldReagan" "WilliamJClinton" "GeorgeWBush" "BarackObama"
TermGroup$`Four Terms`
[1] "FranklinDRoosevelt"
By looking at the bar plots and box-whisker plots of the number of words used in a speech, we can see that the inaugural speeches of the POTUS who served only one term are noticeably longer than those of the other POTUS who served more than one term.
# Compare speech lengths across term-count groups: per-speech bar plot
# (left) and per-group box-whisker plot (right).
par(mar=c(4.5,3,2,3), mfrow=c(1,2))
# Create bar plot of each speech
Speech.Length.Bar.Plot(inaug.list,TermGroup)
# Create box plot of each speech by group
Speech.Length.Box.Plot(inaug.list,TermGroup)
For POTUS who served two terms, we compared the length of speeches of their first term and second term. As shown in the plots, the lengths of the speeches of their two terms are generally consistent, except for Abraham Lincoln’s - Abraham Lincoln’s second term speech is a lot shorter than his first.
# For the two-term presidents only, split speech lengths by first vs.
# second term (In.by.Term = TRUE asks the helpers for the per-term split).
par(mar=c(4.5,3,2,3), mfrow=c(1,2))
# Create bar plot of each speech
Speech.Length.Bar.Plot(inaug.list,TermGroup[2],
In.Term =list("First Term"="1",
"Second Term"="2"),
In.by.Term = TRUE)
# Create box plot of each speech by group
Speech.Length.Box.Plot(inaug.list,TermGroup[2],
In.Term = list("First Term"="1",
"Second Term"="2"),
In.by.Term = TRUE)
By looking at the bee swarm plots and box-whisker plots of the number of words used in a sentence, we can see that the sentences used by POTUS who served only one term are generally a bit longer than those of other POTUS who served more than one term.
# Sentence-length distributions: per-president bee swarm (left) and
# per-group bee swarm (right).
par(mar=c(4.5,5.5,2,1), mfrow=c(1,2))
Bee.Swarm.Plot(sentence.list,TermGroup)
Bee.Swarm.Plot.Group(sentence.list,TermGroup)
Below are listed the shortest and longest sentences used in these speeches for each group.
# Shortest (In.Long=FALSE) then longest (In.Long=TRUE) sentences per
# group, over all four possible terms, ignoring sentences under 3 words.
Sentence.Searcher.Group(sentence.list,TermGroup,
In.Term=c("1","2","3","4"),In.Long=FALSE,
In.min.words=3, In.n.sentences=5)
Sentence.Searcher.Group(sentence.list,TermGroup,
In.Term=c("1","2","3","4"),In.Long=TRUE,
In.min.words=3, In.n.sentences=5)
par(mar=c(4.5,5.5,2,1), mfrow=c(1,2))
# Create bee swarm plot for 'Two Terms' group
ii.var <- sentence.list%>%
  filter(File%in%unlist(TermGroup[2]),
         Term%in%c(1,2))
ii.var$File <- factor(ii.var$File)
# Order presidents by mean sentence length.
# FIX: order=TRUE, not order=T (T is a reassignable variable).
ii.var$FileOrdered <- reorder(ii.var$File, ii.var$word.count, mean, order=TRUE)
# First-term sentences (lighter blue)
beeswarm(word.count~FileOrdered,
         data = ii.var%>%filter(Term==1),
         horizontal = TRUE,
         pch=16, col=alpha(brewer.pal(9, "Blues")[4], 1),
         cex=0.55, cex.axis=0.55, cex.lab=1,
         spacing=5/nlevels(ii.var$FileOrdered),
         las=2, ylab="", xlab="",
         main="Number of Words in a Sentence")
# Second-term sentences (darker blue), overlaid on the same axes
beeswarm(word.count~FileOrdered,
         data = ii.var%>%filter(Term==2),
         add=TRUE,
         horizontal = TRUE,
         pch=16, col=alpha(brewer.pal(9, "Blues")[7], 1),
         cex=0.55, cex.axis=0.55, cex.lab=1,
         spacing=5/nlevels(ii.var$FileOrdered))
legend("bottomright", inset= 0.02, legend =c("First Term", "Second Term"),
       text.col = brewer.pal(9,"Blues")[c(4,7)],
       cex = 0.7, box.lty=2 )
# Aggregate view: all sentences pooled by term number
beeswarm(word.count~Term,
         data = ii.var,
         horizontal = TRUE,
         pch=16, col=alpha(brewer.pal(9,
                                      "Blues")[c(4,7)],
                           0.4),
         cex=0.55, cex.axis=0.55, cex.lab=1,
         spacing=5/nlevels(ii.var$FileOrdered),
         las=2, ylab="", xlab="",
         main="Number of Words in a Sentence")
legend("bottomright", inset=0.02, legend =c("First Term", "Second Term") ,
       text.col = brewer.pal(9,"Blues")[c(4,7)],
       cex = 0.7, box.lty=2 )
# Overlay translucent box plots on the pooled bee swarm
boxplot(ii.var$word.count~ii.var$Term, horizontal=TRUE,
        col="#0000ff22", axes=FALSE, add=TRUE)
For POTUS who served two terms, we compared speeches of their first term and second term. From the plots, we can see that speeches of their two terms are generally consistent. Below are listed the shortest and longest sentences in the speeches of the presidents who served two terms.
# Shortest sentences of the two-term presidents, first vs. second term,
# shown side by side (check.names=FALSE keeps the space in the names).
short.two.terms <- data.frame(
  "First Term"  = Sentence.Searcher.Group(sentence.list, TermGroup, c("1"), FALSE)[, 2],
  "Second Term" = Sentence.Searcher.Group(sentence.list, TermGroup, c("2"), FALSE)[, 2],
  check.names = FALSE)
short.two.terms
# Longest sentences of the two-term presidents, first vs. second term.
long.two.terms <- data.frame(
  "First Term"  = Sentence.Searcher.Group(sentence.list, TermGroup, c("1"), TRUE)[, 2],
  "Second Term" = Sentence.Searcher.Group(sentence.list, TermGroup, c("2"), TRUE)[, 2],
  check.names = FALSE)
long.two.terms
The text larger in size and darker in color in the word clouds below represents the words that appeared more often in the speeches. As we can see, the most frequent words used by the POTUS of these three groups are the same, which include “will”, “people”, and “government”.
# One word cloud per term-count group, drawn side by side.
par(mar=c(1,1,1,1), mfrow=c(1,length(TermGroup)))
set.seed(123)
Word.Cloud.Group(inaug.list,TermGroup)
For POTUS who served two terms, as shown in the word clouds below, the most frequently used words are very similar between their first term and second term.
par(mar=c(1,1,1,1), mfrow=c(1,2))
set.seed(123)
# One word cloud per term for the two-term presidents
# (i = 1: first-term speeches, i = 2: second-term speeches).
for (i in 1:2){
  ii.var <- inaug.list%>%
    filter(File%in%unlist(TermGroup[2]),
           Term==i)  # FIX: restrict to term i; previously both iterations
                     # used all speeches, so the two clouds were identical
  ii.docs <- Corpus(VectorSource(ii.var$Fulltext))
  ii.docs <- tm_map(ii.docs, stripWhitespace) # Eliminate extra whitespace
  ii.docs <- tm_map(ii.docs, content_transformer(tolower)) # Convert to lower case
  ii.docs <- tm_map(ii.docs, removeWords, stopwords("english")) # Remove stopwords
  ii.docs <- tm_map(ii.docs, removeWords, character(0))
  ii.docs <- tm_map(ii.docs, removePunctuation) # Remove punctuations
  #ii.docs <- tm_map(ii.docs, removeNumbers) # Remove numbers
  #ii.docs <- tm_map(ii.docs, stemDocument) # Stem document
  ii.tdm <- TermDocumentMatrix(ii.docs)
  ii.tdm.tidy <- tidy(ii.tdm)
  # Total count of each term across all documents in this group
  ii.tdm.var <- summarise(group_by(ii.tdm.tidy, term), sum(count))
  wordcloud(ii.tdm.var$term, ii.tdm.var$`sum(count)`,
            scale=c(5,0.5),
            max.words=100,
            min.freq=1,
            random.order=FALSE,
            rot.per=0.3,
            use.r.layout=TRUE,  # FIX: TRUE, not T
            random.color=FALSE,
            colors=brewer.pal(5, "Blues")
  )
  title(main = c("First Term","Second Term")[i],
        cex.main=1.5,
        col.main=brewer.pal(5, "Blues")[4])
}
As shown in the bar plots below, the overall emotion patterns are very similar for these three groups.
# Average emotion values, one barplot panel per term-count group.
par(mar=c(4.5,5.5,2,1), mfrow=c(length(TermGroup),1))
# Create barplot of average value of emotions for each group
Sentimental.Analysis.Plots(sentence.list,TermGroup)
We also compared the first term and second term speeches of the POTUS who served two terms. As shown below, the fear level increases but the sadness level decreases from the first term to the second term, while all other emotions seemed to stay at the same level as in the first term.
par(mar=c(4.5,5.5,2,1), mfrow=c(2,1))
# Create heatmap for correlations and
# barplot of the average value of emotions for groups in In.Group
# heatmap.2(cor(sentence.list%>%
#                 filter(File%in%unlist(TermGroup[2]))%>%
#                 select(anger:trust)),
#           scale = "none",
#           col = bluered(100), margin=c(6, 6), key=F,
#           trace = "none", density.info = "none")
# For each term of the two-term presidents, plot the share of sentences
# whose score for each emotion exceeds 0.01, sorted low-to-high.
# (Refactor: the original duplicated this code for terms 1 and 2.)
for (tm in 1:2) {
  emo.means <- colMeans(sentence.list%>%
                          filter(File%in%unlist(TermGroup[2]), Term==tm)%>%
                          select(anger:trust) > 0.01)
  ord <- order(emo.means)
  barplot(emo.means[ord],
          las=2, col=brewer.pal(8,"Pastel2")[ord],
          horiz=TRUE, main=c("First Term", "Second Term")[tm],
          xlab = "Average Value of Emotions")
}
Trying to better understand the difference, we have identified the most influential sentences that drove the fear and sadness levels for both terms.
# Sentences with fear/sadness scores, split by term, for the
# two-term presidents.
i.emo.var <- sentence.list%>%
  filter(File%in%unlist(TermGroup[2]),Term==1)%>%
  select(sentences, fear,sadness)
ii.emo.var <- sentence.list%>%
  filter(File%in%unlist(TermGroup[2]),Term==2)%>%
  select(sentences, fear,sadness)
# Top six sentences per emotion per term
i.fear <- head(i.emo.var%>%arrange(desc(fear))%>%select(sentences))
ii.fear <- head(ii.emo.var%>%arrange(desc(fear))%>%select(sentences))
i.sadness <- head(i.emo.var%>%arrange(desc(sadness))%>%select(sentences))
ii.sadness <- head(ii.emo.var%>%arrange(desc(sadness))%>%select(sentences))  # FIX: was misspelled 'ii.dadness'
i.emo.df <- cbind(i.fear,ii.fear,i.sadness,ii.sadness)
colnames(i.emo.df) <- c("First Term Fear","Second Term Fear",
                        "First Term Sadness", "Second Term Sadness")  # FIX: "Fist" -> "First"
i.emo.df
Below are the emotionally charged sentences of each of these three groups.
# What are the emotionally charged sentences?
# Top emotional sentence per emotion, for every term-count group,
# across all terms.
Sentimental.Sentence.Searcher(sentence.list,TermGroup,c("1","2","3","4"))
One Term
anger "Crime is increasing."
anticipation "God bless you."
disgust "we do not hate;"
fear "We know that self-government is difficult."
joy "God bless you."
sadness "There are the homeless, lost and roaming."
surprise "in all things, generosity."
trust "God bless you."
Two Terms
anger "They are inconvenient."
anticipation "to the elevation of labor;"
disgust "With riches has come inexcusable waste."
fear "government is the problem."
joy "to the elevation of labor;"
sadness "Dark pictures and gloomy forebodings are worse than useless."
surprise "Hope maketh not ashamed."
trust "freedom of religion;"
Four Terms
anger "Happiness lies not in the mere possession of money;"
anticipation "We shall strive for perfection."
disgust "Yet our distress comes from no failure of substance."
fear "These are the lines of attack."
joy "Have we found our happy valley?"
sadness "We are stricken by no plague of locusts."
surprise "power to do good."
trust "Have we found our happy valley?"
For the POTUS who have served two terms, below are the emotionally charged sentences of their first term speeches.
Sentimental.Sentence.Searcher(sentence.list,TermGroup[2],c("1"))
Two Terms
anger "Americans deserve better."
anticipation "God bless you, and thank you."
disgust "With riches has come inexcusable waste."
fear "government is the problem."
joy "God bless you, and thank you."
sadness "Unanimity is impossible."
surprise "I know America's youth."
trust "freedom of religion;"
And below listed the emotionally charged sentences of their second term speeches.
Sentimental.Sentence.Searcher(sentence.list,TermGroup[2],c("2"))
Two Terms
anger "They are inconvenient."
anticipation "to the elevation of labor;"
disgust "They are inconvenient."
fear "They fuel the fanaticism of terror."
joy "to the elevation of labor;"
sadness "Dark pictures and gloomy forebodings are worse than useless."
surprise "Hope maketh not ashamed."
trust "to the elevation of labor;"
We perform the k-means clustering for the two groups, POTUS who served one term or two terms. The results can be visualized as shown below.
# K-means clustering of sentence emotions: one-term vs. two-term group.
# NOTE(review): par(mfrow=) does not affect ggplot-based fviz_cluster
# output — confirm the intended side-by-side layout.
par(mar=c(1,1,1,1), mfrow=c(1,2))
# One Term
## Perform k-means clustering
clustering.rst <- Sentimental.Sentence.KMeans(sentence.list,TermGroup[1],c("1","2","3","4"))
## Visualize clustering results
fviz_cluster(clustering.rst[[1]],
stand=F, repel= TRUE,
data = clustering.rst[[2]][,-1],
xlab="", xaxt="n",
show.clust.cent=FALSE)
# Two Terms
## Perform k-means clustering
clustering.rst <- Sentimental.Sentence.KMeans(sentence.list,TermGroup[2],c("1","2","3","4"))
## Visualize clustering results
fviz_cluster(clustering.rst[[1]],
stand=F, repel= TRUE,
data = clustering.rst[[2]][,-1],
xlab="", xaxt="n",
show.clust.cent=FALSE)
For POTUS who served two terms, it is interesting to see that there are a lot of differences in the clustering results between their first term and second term, as shown below.
# K-means clustering of sentence emotions for the two-term presidents,
# first term vs. second term.
# NOTE(review): par(mfrow=) does not affect ggplot-based fviz_cluster
# output — confirm the intended stacked layout.
par(mar=c(4.5,5.5,2,1), mfrow=c(2,1))
# First Term
## Perform k-means clustering
clustering.rst <- Sentimental.Sentence.KMeans(sentence.list,TermGroup[2],c("1"))
## Visualize clustering results
fviz_cluster(clustering.rst[[1]],
stand=F, repel= TRUE,
data = clustering.rst[[2]][,-1],
xlab="", xaxt="n",
show.clust.cent=FALSE)
# Second Term
## Perform k-means clustering
clustering.rst <- Sentimental.Sentence.KMeans(sentence.list,TermGroup[2],c("2"))
## Visualize clustering results
fviz_cluster(clustering.rst[[1]],
stand=F, repel= TRUE,
data = clustering.rst[[2]][,-1],
xlab="", xaxt="n",
show.clust.cent=FALSE)
| Political Party | Number of Presidents |
|---|---|
| Republican | 17 |
| Democratic | 14 |
### Political Party
# Summary of groups by political party: distinct presidents per party.
inaug.list %>%
  group_by(Party) %>%
  summarise(Count = n_distinct(President))
# Identify 'File' (one entry per inaugural speech) for each party
Democrats <- with(inaug.list, File[Party == "Democratic"])
Republicans <- with(inaug.list, File[Party == "Republican"])
# Named list of speech files per party
PartyGroup <- list("Republicans" = Republicans, "Democrats" = Democrats)
#PartyGroup.name <- "Political Party"
31 POTUS are assigned into 2 groups by the political party they served, i.e., Republican vs. Democratic. The rest of the POTUS will not be in scope of the analysis in terms of the political party they served.
PartyGroup$Republicans
[1] "AbrahamLincoln" "AbrahamLincoln" "UlyssesSGrant" "UlyssesSGrant"
[5] "RutherfordBHayes" "JamesGarfield" "BenjaminHarrison" "WilliamMcKinley"
[9] "WilliamMcKinley" "TheodoreRoosevelt" "WilliamHowardTaft" "WarrenGHarding"
[13] "CalvinCoolidge" "HerbertHoover" "DwightDEisenhower" "DwightDEisenhower"
[17] "RichardNixon" "RichardNixon" "RonaldReagan" "RonaldReagan"
[21] "GeorgeBush" "GeorgeWBush" "GeorgeWBush" "DonaldJTrump"
PartyGroup$Democrats
[1] "AndrewJackson" "AndrewJackson" "MartinvanBuren" "JamesKPolk"
[5] "FranklinPierce" "JamesBuchanan" "GroverCleveland" "GroverCleveland"
[9] "WoodrowWilson" "WoodrowWilson" "FranklinDRoosevelt" "FranklinDRoosevelt"
[13] "FranklinDRoosevelt" "FranklinDRoosevelt" "HarrySTruman" "JohnFKennedy"
[17] "LyndonBJohnson" "JimmyCarter" "WilliamJClinton" "WilliamJClinton"
[21] "BarackObama" "BarackObama"
# Speech lengths by party: per-speech bar plot and per-group box plot.
par(mar=c(4.5,3,2,3), mfrow=c(1,2))
# Create bar plot of each speech
Speech.Length.Bar.Plot(inaug.list,PartyGroup)
# Create box plot of each speech by group
Speech.Length.Box.Plot(inaug.list,PartyGroup)
# Sentence-length distributions by party: per-president and per-group.
par(mar=c(4.5,5.5,2,1), mfrow=c(1,2))
Bee.Swarm.Plot(sentence.list,PartyGroup)
Bee.Swarm.Plot.Group(sentence.list,PartyGroup)
Below are listed the shortest and longest sentences in the speeches.
# Shortest (FALSE), then longest (TRUE), sentences per party, all terms.
Sentence.Searcher.Group(sentence.list,PartyGroup,c("1","2","3","4"),FALSE)
Sentence.Searcher.Group(sentence.list,PartyGroup,c("1","2","3","4"),TRUE)
# One word cloud per party, side by side.
par(mar=c(4.5,5.5,2,1), mfrow=c(1,length(PartyGroup)))
set.seed(123)
Word.Cloud.Group(inaug.list,PartyGroup)
As shown, the fear level of speeches of Democrats is higher than that of Republicans, while all other emotions seemed to be at about the same level for both parties.
# Average emotion values, one barplot panel per party.
par(mar=c(4.5,5.5,2,1), mfrow=c(length(PartyGroup),1))
# Create barplot of average value of emotions for each group
Sentimental.Analysis.Plots(sentence.list,PartyGroup)
We would like to look into it more closely and try to identify the most influential sentences that drove the difference.
# Side-by-side table of the six top fear-scoring sentences per party.
rep.fear <- sentence.list %>%
  filter(File %in% unlist(PartyGroup[1])) %>%
  select(sentences, fear) %>%
  arrange(desc(fear)) %>%
  select(sentences) %>%
  head()
dem.fear <- sentence.list %>%
  filter(File %in% unlist(PartyGroup[2])) %>%
  select(sentences, fear) %>%
  arrange(desc(fear)) %>%
  select(sentences) %>%
  head()
i.emo.df <- cbind(rep.fear, dem.fear)
colnames(i.emo.df) <- c("Republicans Fear","Democrats Fear")
i.emo.df
Below are the emotionally charged sentences of each of these two groups.
# What are the emotionally charged sentences?
# Top emotional sentence per emotion, per party, across all terms.
Sentimental.Sentence.Searcher(sentence.list,PartyGroup,c("1","2","3","4"))
We perform the k-means clustering for the two groups. The results can be visualized as shown below.
# K-means clustering of sentence emotions, per party.
# NOTE(review): par(mfrow=) does not affect ggplot-based fviz_cluster
# output — confirm the intended stacked layout.
par(mar=c(4.5,5.5,2,1), mfrow=c(2,1))
# Republican
## Perform k-means clustering
clustering.rst <- Sentimental.Sentence.KMeans(sentence.list,PartyGroup[1],c("1","2","3","4"))
## Visualize clustering results
fviz_cluster(clustering.rst[[1]],
stand=F, repel= TRUE,
data = clustering.rst[[2]][,-1],
xlab="", xaxt="n",
show.clust.cent=FALSE)
# Democratic
## Perform k-means clustering
clustering.rst <- Sentimental.Sentence.KMeans(sentence.list,PartyGroup[2],c("1","2","3","4"))
## Visualize clustering results
fviz_cluster(clustering.rst[[1]],
stand=F, repel= TRUE,
data = clustering.rst[[2]][,-1],
xlab="", xaxt="n",
show.clust.cent=FALSE)
| Educational Background | Number of Presidents |
|---|---|
| No Colledge | 9 |
| Undergraduate | 21 |
| Advanced Degree | 7 |
Educational background of POTUS was obtained from the following websites:
### Educational Background
# Identify the educational background of each POTUS.
# NOTE: the misspelling "Colledge" is kept in the group name because
# later code indexes EducationGroup$`No Colledge`.
# NOTE(review): WilliamHenryHarrison and WilliamHowardTaft appear in
# none of the groups — confirm whether that omission is intentional.
no.college <- c("GeorgeWashington", "JamesMonroe", "AndrewJackson",
                "MartinvanBuren", "ZacharyTaylor", "AbrahamLincoln",
                "GroverCleveland", "WilliamMcKinley", "HarrySTruman")
undergrad.only <- c("ThomasJefferson", "JamesMadison", "JohnQuincyAdams",
                    "JamesKPolk", "FranklinPierce", "JamesBuchanan",
                    "UlyssesSGrant", "JamesGarfield", "BenjaminHarrison",
                    "TheodoreRoosevelt", "WarrenGHarding", "CalvinCoolidge",
                    "HerbertHoover", "FranklinDRoosevelt", "DwightDEisenhower",
                    "JohnFKennedy", "LyndonBJohnson", "JimmyCarter",
                    "RonaldReagan", "GeorgeBush", "DonaldJTrump")
# Advanced degrees, grouped by kind
ma.ms <- c("JohnAdams")
mba <- c("GeorgeWBush")
law <- c("RutherfordBHayes", "RichardNixon", "WilliamJClinton", "BarackObama")
doctorate <- c("WoodrowWilson")
advanced.degree <- c(ma.ms, mba, law, doctorate)
# Group POTUS into three categories by highest education level
EducationGroup <- list("No Colledge" = no.college,
                       "Undergraduate" = undergrad.only,
                       "Advanced Degree" = advanced.degree)
#EducationGroup.name <- "Educational Background"
#summary(EducationGroup)[,1]
POTUS are assigned into three groups based on their highest education level.
# Print the membership of each education group.
EducationGroup$`No Colledge`
EducationGroup$Undergraduate
EducationGroup$`Advanced Degree`
# Speech lengths by education group: bar plot and box plot.
par(mar=c(4.5,3,2,3), mfrow=c(1,2))
# Create bar plot of each speech
Speech.Length.Bar.Plot(inaug.list,EducationGroup)
# Create box plot of each speech by group
Speech.Length.Box.Plot(inaug.list,EducationGroup)
# Sentence-length distributions by education group.
par(mar=c(4.5,5.5,2,1), mfrow=c(1,2))
Bee.Swarm.Plot(sentence.list,EducationGroup)
Bee.Swarm.Plot.Group(sentence.list,EducationGroup)
Below are listed the shortest and longest sentences in the speeches.
# Shortest (FALSE), then longest (TRUE), sentences per education group.
Sentence.Searcher.Group(sentence.list,EducationGroup,c("1","2","3","4"),FALSE)
Sentence.Searcher.Group(sentence.list,EducationGroup,c("1","2","3","4"),TRUE)
# One word cloud per education group.
par(mar=c(4.5,5.5,2,1), mfrow=c(1,length(EducationGroup)))
set.seed(123)
Word.Cloud.Group(inaug.list,EducationGroup)
# Average emotion values, one barplot panel per education group.
par(mar=c(4.5,5.5,2,1), mfrow=c(length(EducationGroup),1))
# Create barplot of average value of emotions for each group
Sentimental.Analysis.Plots(sentence.list,EducationGroup)
Below are the emotionally charged sentences of each of these three groups.
# What are the emotionally charged sentences?
# Top emotional sentence per emotion, per education group.
Sentimental.Sentence.Searcher(sentence.list,EducationGroup,c("1","2","3","4"))
We performed the k-means clustering for the three groups. The results can be visualized as shown below.
# K-means clustering of sentence emotions, per education group.
# NOTE(review): mfrow is c(2,1) but three plots follow, and par() does
# not affect ggplot-based fviz_cluster output — confirm intended layout.
par(mar=c(4.5,5.5,2,1), mfrow=c(2,1))
# No College Degree
## Perform k-means clustering
clustering.rst <- Sentimental.Sentence.KMeans(sentence.list,EducationGroup[1],c("1","2","3","4"))
## Visualize clustering results
fviz_cluster(clustering.rst[[1]],
stand=F, repel= TRUE,
data = clustering.rst[[2]][,-1],
xlab="", xaxt="n",
show.clust.cent=FALSE)
# Undergraduate Degree Only
## Perform k-means clustering
clustering.rst <- Sentimental.Sentence.KMeans(sentence.list,EducationGroup[2],c("1","2","3","4"))
## Visualize clustering results
fviz_cluster(clustering.rst[[1]],
stand=F, repel= TRUE,
data = clustering.rst[[2]][,-1],
xlab="", xaxt="n",
show.clust.cent=FALSE)
# Advanced Degree
## Perform k-means clustering
clustering.rst <- Sentimental.Sentence.KMeans(sentence.list,EducationGroup[3],c("1","2","3","4"))
## Visualize clustering results
fviz_cluster(clustering.rst[[1]],
stand=F, repel= TRUE,
data = clustering.rst[[2]][,-1],
xlab="", xaxt="n",
show.clust.cent=FALSE)
| Career Prior to Politics | Number of Presidents |
|---|---|
| Lawyer | 21 |
| Military Leader | 3 |
| Farmer | 3 |
| Educator | 2 |
| Businessperson | 3 |
Career background of POTUS was obtained from the following website:
### Career Prior to Politics
# Identify the career of each POTUS before entering politics.
# NOTE: the misspelling "Learder" is kept in the group name because
# later code indexes CareerGroup$`Military Learder`.
# NOTE(review): several POTUS (e.g. TheodoreRoosevelt, WilliamHowardTaft,
# HerbertHoover, JohnFKennedy, RonaldReagan) appear in no career group —
# confirm whether that omission is intentional.
lawyers <- c("JohnAdams", "ThomasJefferson", "JamesMadison",
             "JamesMonroe", "JohnQuincyAdams", "AndrewJackson",
             "MartinvanBuren", "JamesKPolk", "FranklinPierce",
             "JamesBuchanan", "AbrahamLincoln", "RutherfordBHayes",
             "JamesGarfield", "GroverCleveland", "BenjaminHarrison",
             "WilliamMcKinley", "CalvinCoolidge", "FranklinDRoosevelt",
             "RichardNixon", "WilliamJClinton", "BarackObama")
soldiers <- c("ZacharyTaylor", "UlyssesSGrant", "DwightDEisenhower")
farmers <- c("GeorgeWashington", "HarrySTruman", "JimmyCarter")
business.people <- c("GeorgeBush", "GeorgeWBush", "DonaldJTrump")
educators <- c("WoodrowWilson", "LyndonBJohnson")
# Group POTUS into five categories by pre-politics career
CareerGroup <- list("Lawyer" = lawyers,
                    "Military Learder" = soldiers,
                    "Farmer" = farmers,
                    "Educator" = educators,
                    "Businessperson" = business.people)
#CareerGroup.name <- "Career Prior to Politics"
#summary(CareerGroup)[,1]
POTUS are assigned into five groups based on their career prior to politics.
# Print the membership of each career group.
CareerGroup$Lawyer
CareerGroup$`Military Learder`
CareerGroup$Farmer
CareerGroup$Educator
CareerGroup$Businessperson
# Speech lengths by career group: bar plot and box plot.
par(mar=c(4.5,3,2,3), mfrow=c(1,2))
# Create bar plot of each speech
Speech.Length.Bar.Plot(inaug.list,CareerGroup)
# Create box plot of each speech by group
Speech.Length.Box.Plot(inaug.list,CareerGroup)
# Sentence-length distributions by career group.
par(mar=c(4.5,5.5,2,1), mfrow=c(1,2))
Bee.Swarm.Plot(sentence.list,CareerGroup)
Bee.Swarm.Plot.Group(sentence.list,CareerGroup)
Below are listed the shortest and longest sentences in the speeches.
# Shortest (FALSE), then longest (TRUE), sentences per career group.
Sentence.Searcher.Group(sentence.list,CareerGroup,c("1","2","3","4"),FALSE)
Sentence.Searcher.Group(sentence.list,CareerGroup,c("1","2","3","4"),TRUE)
# One word cloud per career group.
par(mar=c(4.5,5.5,2,1), mfrow=c(1,length(CareerGroup)))
set.seed(123)
Word.Cloud.Group(inaug.list,CareerGroup)
# Average emotion values, one barplot panel per career group.
par(mar=c(4.5,5.5,2,1), mfrow=c(length(CareerGroup),1))
# Create barplot of average value of emotions for each group
Sentimental.Analysis.Plots(sentence.list,CareerGroup)
Below are the emotionally charged sentences of each of these groups.
# What are the emotionally charged sentences?
# Top emotional sentence per emotion, per career group.
Sentimental.Sentence.Searcher(sentence.list,CareerGroup,c("1","2","3","4"))
We performed the k-means clustering of the Lawyer group, as size of other groups are quite small.
# K-means clustering of sentence emotions for the Lawyer group only
# (other career groups are too small).
# NOTE(review): mfrow is c(2,1) but only one plot follows, and par()
# does not affect ggplot-based fviz_cluster output — confirm intent.
par(mar=c(4.5,5.5,2,1), mfrow=c(2,1))
# Lawyer
## Perform k-means clustering
clustering.rst <- Sentimental.Sentence.KMeans(sentence.list,CareerGroup[1],c("1","2","3","4"))
## Visualize clustering results
fviz_cluster(clustering.rst[[1]],
stand=F, repel= TRUE,
data = clustering.rst[[2]][,-1],
xlab="", xaxt="n",
show.clust.cent=FALSE)
| Served During War Era? | Number of Presidents |
|---|---|
| No War Era | 38 |
| War Era | 11 |
The following website was used as a reference to understand more about U.S. history. It presents a timeline of U.S. history, which specifies the wars the U.S. was involved in.
POTUS are assigned into two groups based on whether it was war era while they were serving.
11 POTUS who served during war eras include: James Madison(War of 1812), James Polk(Mexican War), Abraham Lincoln(Civil War), William McKinley(Spanish-American War), Woodrow Wilson(WWI), Franklin D. Roosevelt(WWII), Harry S. Truman(Korean War), Lyndon Johnson(Vietnam War), Richard Nixon(Vietnam War), George Bush(Gulf War), George W. Bush(War on Terror).
The remaining 38 inaugural speeches were delivered by POTUS who were not serving during a war era.
### Served During War Era?
# Identify POTUS who served during a U.S. war era (see reference above).
war.era <- c("JamesMadison", "JamesKPolk", "AbrahamLincoln",
             "WilliamMcKinley", "WoodrowWilson", "FranklinDRoosevelt",
             "HarrySTruman", "LyndonBJohnson", "RichardNixon",
             "GeorgeBush", "GeorgeWBush")
# Every speech file not delivered by a war-era president
nowar.era <- with(inaug.list, File[!(File %in% war.era)])
# Named list: speeches outside vs. inside war eras
WarGroup <- list("No War Era" = nowar.era, "War Era" = war.era)
#WarGroup.name <- "Whether Served During War Era"
# Show the size of each group
summary(WarGroup)[,1]
# Speech lengths by war-era group: bar plot and box plot.
par(mar=c(4.5,3,2,3), mfrow=c(1,2))
# Create bar plot of each speech
Speech.Length.Bar.Plot(inaug.list,WarGroup)
# Create box plot of each speech by group
Speech.Length.Box.Plot(inaug.list,WarGroup)
# Sentence-length distributions by war-era group.
par(mar=c(4.5,5.5,2,1), mfrow=c(1,2))
Bee.Swarm.Plot(sentence.list,WarGroup)
Bee.Swarm.Plot.Group(sentence.list,WarGroup)
Below are listed the shortest and longest sentences in the speeches.
# Shortest (FALSE), then longest (TRUE), sentences per war-era group.
Sentence.Searcher.Group(sentence.list,WarGroup,c("1","2","3","4"),FALSE)
Sentence.Searcher.Group(sentence.list,WarGroup,c("1","2","3","4"),TRUE)
# One word cloud per war-era group.
par(mar=c(4.5,5.5,2,1), mfrow=c(1,length(WarGroup)))
set.seed(123)
Word.Cloud.Group(inaug.list,WarGroup)
# Average emotion values, one barplot panel per war-era group.
par(mar=c(4.5,5.5,2,1), mfrow=c(length(WarGroup),1))
# Create barplot of average value of emotions for each group
Sentimental.Analysis.Plots(sentence.list,WarGroup)
Below are the emotionally charged sentences of each of these two groups.
# What are the emotionally charged sentences?
# Top emotional sentence per emotion, per war-era group.
Sentimental.Sentence.Searcher(sentence.list,WarGroup,c("1","2","3","4"))
We performed the k-means clustering for the two groups. The results can be visualized as shown below.
# K-means clustering of sentence emotions: no-war-era vs. war-era group.
# NOTE(review): par(mfrow=) does not affect ggplot-based fviz_cluster
# output — confirm the intended stacked layout.
par(mar=c(4.5,5.5,2,1), mfrow=c(2,1))
# No War Era
## Perform k-means clustering
clustering.rst <- Sentimental.Sentence.KMeans(sentence.list,WarGroup[1],c("1","2","3","4"))
## Visualize clustering results
fviz_cluster(clustering.rst[[1]],
stand=F, repel= TRUE,
data = clustering.rst[[2]][,-1],
xlab="", xaxt="n",
show.clust.cent=FALSE)
# War Era
## Perform k-means clustering
clustering.rst <- Sentimental.Sentence.KMeans(sentence.list,WarGroup[2],c("1","2","3","4"))
## Visualize clustering results
fviz_cluster(clustering.rst[[1]],
stand=F, repel= TRUE,
data = clustering.rst[[2]][,-1],
xlab="", xaxt="n",
show.clust.cent=FALSE)